doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/cli/main.py +5 -12
- doctra/cli/utils.py +2 -3
- doctra/engines/image_restoration/docres_engine.py +6 -11
- doctra/engines/vlm/outlines_types.py +13 -9
- doctra/engines/vlm/service.py +4 -2
- doctra/exporters/excel_writer.py +89 -0
- doctra/exporters/html_writer.py +206 -1
- doctra/parsers/enhanced_pdf_parser.py +124 -31
- doctra/parsers/structured_pdf_parser.py +58 -15
- doctra/parsers/table_chart_extractor.py +290 -284
- doctra/ui/app.py +39 -960
- doctra/ui/docres_ui.py +338 -0
- doctra/ui/docres_wrapper.py +120 -0
- doctra/ui/enhanced_parser_ui.py +483 -0
- doctra/ui/full_parse_ui.py +539 -0
- doctra/ui/tables_charts_ui.py +445 -0
- doctra/ui/ui_helpers.py +435 -0
- doctra/utils/progress.py +7 -7
- doctra/utils/structured_utils.py +5 -2
- doctra/version.py +1 -1
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/METADATA +1 -1
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/RECORD +25 -19
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/WHEEL +0 -0
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/top_level.txt +0 -0
doctra/cli/main.py
CHANGED
@@ -9,6 +9,7 @@ detection results, and analyze document structure from the command line.
|
|
9
9
|
import click
|
10
10
|
import os
|
11
11
|
import sys
|
12
|
+
import traceback
|
12
13
|
from pathlib import Path
|
13
14
|
from typing import Optional
|
14
15
|
|
@@ -25,6 +26,10 @@ except ImportError:
|
|
25
26
|
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
26
27
|
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
27
28
|
|
29
|
+
# Import additional modules
|
30
|
+
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
31
|
+
from doctra.engines.image_restoration import DocResEngine
|
32
|
+
|
28
33
|
|
29
34
|
@click.group(invoke_without_command=True)
|
30
35
|
@click.pass_context
|
@@ -247,7 +252,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
|
|
247
252
|
except Exception as e:
|
248
253
|
click.echo(f"❌ Error initializing parser: {e}", err=True)
|
249
254
|
if verbose:
|
250
|
-
import traceback
|
251
255
|
click.echo(traceback.format_exc(), err=True)
|
252
256
|
sys.exit(1)
|
253
257
|
|
@@ -271,7 +275,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
|
|
271
275
|
except Exception as e:
|
272
276
|
click.echo(f"❌ Error during parsing: {e}", err=True)
|
273
277
|
if verbose:
|
274
|
-
import traceback
|
275
278
|
click.echo(traceback.format_exc(), err=True)
|
276
279
|
sys.exit(1)
|
277
280
|
finally:
|
@@ -394,7 +397,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
|
|
394
397
|
except Exception as e:
|
395
398
|
click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
|
396
399
|
if verbose:
|
397
|
-
import traceback
|
398
400
|
click.echo(traceback.format_exc(), err=True)
|
399
401
|
sys.exit(1)
|
400
402
|
|
@@ -418,7 +420,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
|
|
418
420
|
except Exception as e:
|
419
421
|
click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
|
420
422
|
if verbose:
|
421
|
-
import traceback
|
422
423
|
click.echo(traceback.format_exc(), err=True)
|
423
424
|
sys.exit(1)
|
424
425
|
finally:
|
@@ -526,7 +527,6 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
526
527
|
except Exception as e:
|
527
528
|
click.echo(f"❌ Error during chart extraction: {e}", err=True)
|
528
529
|
if verbose:
|
529
|
-
import traceback
|
530
530
|
click.echo(traceback.format_exc(), err=True)
|
531
531
|
sys.exit(1)
|
532
532
|
|
@@ -604,7 +604,6 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
604
604
|
except Exception as e:
|
605
605
|
click.echo(f"❌ Error during table extraction: {e}", err=True)
|
606
606
|
if verbose:
|
607
|
-
import traceback
|
608
607
|
click.echo(traceback.format_exc(), err=True)
|
609
608
|
sys.exit(1)
|
610
609
|
|
@@ -683,7 +682,6 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
683
682
|
except Exception as e:
|
684
683
|
click.echo(f"❌ Error during extraction: {e}", err=True)
|
685
684
|
if verbose:
|
686
|
-
import traceback
|
687
685
|
click.echo(traceback.format_exc(), err=True)
|
688
686
|
sys.exit(1)
|
689
687
|
|
@@ -772,7 +770,6 @@ def visualize(pdf_path: Path, pages: int, columns: int, width: int,
|
|
772
770
|
except Exception as e:
|
773
771
|
click.echo(f"❌ Error creating visualization: {e}", err=True)
|
774
772
|
if verbose:
|
775
|
-
import traceback
|
776
773
|
click.echo(traceback.format_exc(), err=True)
|
777
774
|
sys.exit(1)
|
778
775
|
|
@@ -805,7 +802,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
|
|
805
802
|
click.echo(f"🔍 Analyzing: {pdf_path.name}")
|
806
803
|
|
807
804
|
# Create layout engine for analysis only
|
808
|
-
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
809
805
|
|
810
806
|
if verbose:
|
811
807
|
click.echo(f" Using model: {layout_model}")
|
@@ -903,7 +899,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
|
|
903
899
|
except Exception as e:
|
904
900
|
click.echo(f"❌ Error analyzing PDF: {e}", err=True)
|
905
901
|
if verbose:
|
906
|
-
import traceback
|
907
902
|
click.echo(traceback.format_exc(), err=True)
|
908
903
|
sys.exit(1)
|
909
904
|
|
@@ -922,7 +917,6 @@ def info():
|
|
922
917
|
click.echo("=" * 50)
|
923
918
|
|
924
919
|
# Check Python version
|
925
|
-
import sys
|
926
920
|
python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
|
927
921
|
click.echo(f"Python version: {python_version}")
|
928
922
|
|
@@ -1003,7 +997,6 @@ def info():
|
|
1003
997
|
# DocRes information
|
1004
998
|
click.echo("\nDocRes Image Restoration:")
|
1005
999
|
try:
|
1006
|
-
from doctra.engines.image_restoration import DocResEngine
|
1007
1000
|
docres = DocResEngine()
|
1008
1001
|
click.echo(f" ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
|
1009
1002
|
click.echo(" Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")
|
doctra/cli/utils.py
CHANGED
@@ -7,8 +7,10 @@ different CLI commands.
|
|
7
7
|
|
8
8
|
import click
|
9
9
|
import sys
|
10
|
+
import traceback
|
10
11
|
from typing import Optional, Dict, Any
|
11
12
|
from pathlib import Path
|
13
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
12
14
|
|
13
15
|
|
14
16
|
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
|
@@ -58,7 +60,6 @@ def handle_exception(e: Exception, verbose: bool = False) -> None:
|
|
58
60
|
"""
|
59
61
|
click.echo(f"❌ Error: {e}", err=True)
|
60
62
|
if verbose:
|
61
|
-
import traceback
|
62
63
|
click.echo(traceback.format_exc(), err=True)
|
63
64
|
sys.exit(1)
|
64
65
|
|
@@ -271,8 +272,6 @@ def create_progress_callback(description: str, total: int):
|
|
271
272
|
:return: Callable progress callback function that takes an integer
|
272
273
|
representing the number of completed items
|
273
274
|
"""
|
274
|
-
import sys
|
275
|
-
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
276
275
|
|
277
276
|
# Enhanced environment detection
|
278
277
|
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
doctra/engines/image_restoration/docres_engine.py
CHANGED
@@ -18,6 +18,8 @@ import sys
|
|
18
18
|
import cv2
|
19
19
|
import numpy as np
|
20
20
|
import torch
|
21
|
+
import tempfile
|
22
|
+
import time
|
21
23
|
from pathlib import Path
|
22
24
|
from typing import Union, List, Tuple, Optional, Dict, Any
|
23
25
|
|
@@ -85,12 +87,12 @@ def load_docres_weights_from_hf():
|
|
85
87
|
if is_notebook:
|
86
88
|
progress_bar = create_notebook_friendly_bar(
|
87
89
|
total=2,
|
88
|
-
desc="
|
90
|
+
desc="Downloading DocRes models from Hugging Face Hub"
|
89
91
|
)
|
90
92
|
else:
|
91
93
|
progress_bar = create_beautiful_progress_bar(
|
92
94
|
total=2,
|
93
|
-
desc="
|
95
|
+
desc="Downloading DocRes models from Hugging Face Hub",
|
94
96
|
leave=True
|
95
97
|
)
|
96
98
|
|
@@ -308,8 +310,6 @@ class DocResEngine:
|
|
308
310
|
|
309
311
|
def _run_single_task(self, img_array: np.ndarray, task: str, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
|
310
312
|
"""Run a single restoration task"""
|
311
|
-
import tempfile
|
312
|
-
import time
|
313
313
|
|
314
314
|
# Create temporary file for inference
|
315
315
|
with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
|
@@ -322,7 +322,6 @@ class DocResEngine:
|
|
322
322
|
os.chdir(str(docres_dir))
|
323
323
|
|
324
324
|
# Set global DEVICE variable that DocRes inference expects
|
325
|
-
import torch
|
326
325
|
import inference # Import the inference module to set its global DEVICE
|
327
326
|
inference.DEVICE = self.device
|
328
327
|
|
@@ -364,8 +363,6 @@ class DocResEngine:
|
|
364
363
|
|
365
364
|
def _run_end2end_pipeline(self, img_array: np.ndarray, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
|
366
365
|
"""Run the end2end pipeline: dewarping → deshadowing → appearance"""
|
367
|
-
import tempfile
|
368
|
-
import time
|
369
366
|
|
370
367
|
intermediate_steps = {}
|
371
368
|
|
@@ -374,7 +371,6 @@ class DocResEngine:
|
|
374
371
|
os.chdir(str(docres_dir))
|
375
372
|
|
376
373
|
# Set global DEVICE variable that DocRes inference expects
|
377
|
-
import torch
|
378
374
|
import inference # Import the inference module to set its global DEVICE
|
379
375
|
inference.DEVICE = self.device
|
380
376
|
|
@@ -482,7 +478,6 @@ class DocResEngine:
|
|
482
478
|
"""
|
483
479
|
try:
|
484
480
|
from PIL import Image
|
485
|
-
import numpy as np
|
486
481
|
from doctra.utils.pdf_io import render_pdf_to_images
|
487
482
|
|
488
483
|
# Generate output path if not provided
|
@@ -510,12 +505,12 @@ class DocResEngine:
|
|
510
505
|
if is_notebook:
|
511
506
|
progress_bar = create_notebook_friendly_bar(
|
512
507
|
total=len(pil_pages),
|
513
|
-
desc="
|
508
|
+
desc="Processing pages"
|
514
509
|
)
|
515
510
|
else:
|
516
511
|
progress_bar = create_beautiful_progress_bar(
|
517
512
|
total=len(pil_pages),
|
518
|
-
desc="
|
513
|
+
desc="Processing pages",
|
519
514
|
leave=True
|
520
515
|
)
|
521
516
|
|
doctra/engines/vlm/outlines_types.py
CHANGED
@@ -1,17 +1,19 @@
|
|
1
|
-
from pydantic import BaseModel
|
1
|
+
from pydantic import BaseModel, Field
|
2
2
|
|
3
3
|
class Chart(BaseModel):
|
4
4
|
"""
|
5
5
|
Structured representation of a chart extracted from an image.
|
6
6
|
|
7
|
-
|
8
|
-
using VLM (Vision Language Model) processing.
|
7
|
+
Includes a title, a short description, column headers, and data rows
|
8
|
+
identified using VLM (Vision Language Model) processing.
|
9
9
|
|
10
|
-
:param title: Title or caption of the chart
|
10
|
+
:param title: Title or caption of the chart (max 31 characters)
|
11
|
+
:param description: Short description of the chart (max 300 characters)
|
11
12
|
:param headers: Column headers for the chart data
|
12
13
|
:param rows: Data rows containing the chart values
|
13
14
|
"""
|
14
|
-
title: str
|
15
|
+
title: str = Field(max_length=31)
|
16
|
+
description: str = Field(max_length=300)
|
15
17
|
headers: list[str]
|
16
18
|
rows: list[list[str]]
|
17
19
|
|
@@ -19,13 +21,15 @@ class Table(BaseModel):
|
|
19
21
|
"""
|
20
22
|
Structured representation of a table extracted from an image.
|
21
23
|
|
22
|
-
|
23
|
-
using VLM (Vision Language Model) processing.
|
24
|
+
Includes a title, a short description, column headers, and data rows
|
25
|
+
identified using VLM (Vision Language Model) processing.
|
24
26
|
|
25
|
-
:param title: Title or caption of the table
|
27
|
+
:param title: Title or caption of the table (max 31 characters)
|
28
|
+
:param description: Short description of the table (max 300 characters)
|
26
29
|
:param headers: Column headers for the table data
|
27
30
|
:param rows: Data rows containing the table values
|
28
31
|
"""
|
29
|
-
title: str
|
32
|
+
title: str = Field(max_length=31)
|
33
|
+
description: str = Field(max_length=300)
|
30
34
|
headers: list[str]
|
31
35
|
rows: list[list[str]]
|
doctra/engines/vlm/service.py
CHANGED
@@ -73,7 +73,7 @@ class VLMStructuredExtractor:
|
|
73
73
|
Extract structured chart data from an image.
|
74
74
|
|
75
75
|
:param image_path: Path to the chart image file
|
76
|
-
:return: Chart object containing extracted title, headers, and data rows
|
76
|
+
:return: Chart object containing extracted title, description, headers, and data rows
|
77
77
|
:raises Exception: If image processing or VLM extraction fails
|
78
78
|
"""
|
79
79
|
prompt_text = (
|
@@ -81,6 +81,7 @@ class VLMStructuredExtractor:
|
|
81
81
|
"If the title is not present in the image, generate a suitable title. "
|
82
82
|
"Ensure that the table represents the data from the chart accurately."
|
83
83
|
"The number of columns in the headers must match the number of columns in each row."
|
84
|
+
"Also provide a short description (max 300 characters) of the chart."
|
84
85
|
)
|
85
86
|
return self._call(prompt_text, image_path, Chart)
|
86
87
|
|
@@ -89,7 +90,7 @@ class VLMStructuredExtractor:
|
|
89
90
|
Extract structured table data from an image.
|
90
91
|
|
91
92
|
:param image_path: Path to the table image file
|
92
|
-
:return: Table object containing extracted title, headers, and data rows
|
93
|
+
:return: Table object containing extracted title, description, headers, and data rows
|
93
94
|
:raises Exception: If image processing or VLM extraction fails
|
94
95
|
"""
|
95
96
|
prompt_text = (
|
@@ -97,5 +98,6 @@ class VLMStructuredExtractor:
|
|
97
98
|
"Provide the headers and rows of the table, ensuring accuracy in the extraction. "
|
98
99
|
"If the title is not present in the image, generate a suitable title."
|
99
100
|
"The number of columns in the headers must match the number of columns in each row."
|
101
|
+
"Also provide a short description (max 300 characters) of the table."
|
100
102
|
)
|
101
103
|
return self._call(prompt_text, image_path, Table)
|
doctra/exporters/excel_writer.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Dict, Any, List, Set
|
|
5
5
|
import pandas as pd # pip install pandas openpyxl
|
6
6
|
from openpyxl.styles import PatternFill, Font, Alignment
|
7
7
|
from openpyxl.utils import get_column_letter
|
8
|
+
from openpyxl.worksheet.hyperlink import Hyperlink
|
8
9
|
|
9
10
|
_INVALID_SHEET_CHARS = r'[:\\/*?\[\]]' # Excel-invalid characters
|
10
11
|
_MAX_SHEET_LEN = 31
|
@@ -85,6 +86,61 @@ def _autosize_columns(ws, df: pd.DataFrame) -> None:
|
|
85
86
|
ws.column_dimensions[get_column_letter(i)].width = min(max(10, max_len + 2), 60)
|
86
87
|
|
87
88
|
|
89
|
+
def _style_summary_sheet(ws, df: pd.DataFrame, sheet_mapping: dict = None) -> None:
|
90
|
+
"""
|
91
|
+
Apply special styling to the summary sheet with text wrapping for descriptions.
|
92
|
+
Add hyperlinks to table titles that link to their corresponding sheets.
|
93
|
+
|
94
|
+
:param ws: OpenPyXL worksheet object to style
|
95
|
+
:param df: Pandas DataFrame containing the summary data
|
96
|
+
:param sheet_mapping: Dictionary mapping table titles to their sheet names
|
97
|
+
:return: None
|
98
|
+
"""
|
99
|
+
# Style header row
|
100
|
+
_style_header(ws, ncols=df.shape[1])
|
101
|
+
|
102
|
+
# Apply text wrapping to all data cells
|
103
|
+
wrap_alignment = Alignment(wrap_text=True, vertical="top")
|
104
|
+
|
105
|
+
# Apply wrapping to all data rows (skip header row)
|
106
|
+
for row_idx in range(2, len(df) + 2): # Start from row 2 (after header)
|
107
|
+
for col_idx in range(1, df.shape[1] + 1):
|
108
|
+
cell = ws.cell(row=row_idx, column=col_idx)
|
109
|
+
cell.alignment = wrap_alignment
|
110
|
+
|
111
|
+
# Add hyperlink to table title column (column A)
|
112
|
+
if col_idx == 1 and sheet_mapping: # Table Title column
|
113
|
+
table_title = cell.value
|
114
|
+
if table_title and table_title in sheet_mapping:
|
115
|
+
sheet_name = sheet_mapping[table_title]
|
116
|
+
|
117
|
+
# Create hyperlink to the sheet using proper Excel format
|
118
|
+
# Escape sheet name if it contains spaces or special characters
|
119
|
+
if ' ' in sheet_name or any(char in sheet_name for char in ['[', ']', '*', '?', ':', '\\', '/']):
|
120
|
+
hyperlink_ref = f"#'{sheet_name}'!A1"
|
121
|
+
else:
|
122
|
+
hyperlink_ref = f"#{sheet_name}!A1"
|
123
|
+
|
124
|
+
# Use Hyperlink class with proper parameters
|
125
|
+
cell.hyperlink = Hyperlink(ref=hyperlink_ref, target=hyperlink_ref)
|
126
|
+
# Style the hyperlink
|
127
|
+
cell.font = Font(color="0000FF", underline="single")
|
128
|
+
|
129
|
+
# Set specific column widths for summary sheet
|
130
|
+
# Table Title column - narrower
|
131
|
+
ws.column_dimensions['A'].width = 30
|
132
|
+
# Description column - wider to accommodate wrapped text
|
133
|
+
ws.column_dimensions['B'].width = 60
|
134
|
+
# Page column - narrow for page numbers
|
135
|
+
ws.column_dimensions['C'].width = 10
|
136
|
+
# Type column - narrow for Table/Chart
|
137
|
+
ws.column_dimensions['D'].width = 12
|
138
|
+
|
139
|
+
# Set row heights to accommodate wrapped text
|
140
|
+
for row_idx in range(2, len(df) + 2):
|
141
|
+
ws.row_dimensions[row_idx].height = 60 # Allow for multiple lines
|
142
|
+
|
143
|
+
|
88
144
|
def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
|
89
145
|
"""
|
90
146
|
Normalize headers and rows to ensure consistent dimensions.
|
@@ -159,6 +215,31 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
|
|
159
215
|
taken: Set[str] = set()
|
160
216
|
|
161
217
|
with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
|
218
|
+
# Create summary sheet first
|
219
|
+
summary_data = []
|
220
|
+
sheet_mapping = {} # Map table titles to their sheet names
|
221
|
+
|
222
|
+
for item in valid_items:
|
223
|
+
title = item.get("title") or "Untitled"
|
224
|
+
description = item.get("description") or "No description available"
|
225
|
+
page_number = item.get("page", "Unknown")
|
226
|
+
item_type = item.get("type", "Table") # Default to "Table" if not specified
|
227
|
+
|
228
|
+
|
229
|
+
summary_data.append({
|
230
|
+
"Table Title": title,
|
231
|
+
"Description": description,
|
232
|
+
"Page": page_number,
|
233
|
+
"Type": item_type
|
234
|
+
})
|
235
|
+
|
236
|
+
# Create summary sheet first (but without hyperlinks initially)
|
237
|
+
if summary_data:
|
238
|
+
summary_df = pd.DataFrame(summary_data)
|
239
|
+
summary_df.to_excel(writer, sheet_name="Table Summary", index=False)
|
240
|
+
taken.add("Table Summary")
|
241
|
+
|
242
|
+
# Process individual table sheets to build sheet mapping
|
162
243
|
for item in valid_items:
|
163
244
|
try:
|
164
245
|
title = item.get("title") or "Untitled"
|
@@ -166,6 +247,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
|
|
166
247
|
rows = item.get("rows") or []
|
167
248
|
|
168
249
|
sheet_name = _safe_sheet_name(title, taken)
|
250
|
+
|
251
|
+
# Add to sheet mapping for hyperlinks
|
252
|
+
sheet_mapping[title] = sheet_name
|
169
253
|
|
170
254
|
# Normalize data to handle mismatched dimensions
|
171
255
|
normalized_headers, normalized_rows = _normalize_data(headers, rows)
|
@@ -194,4 +278,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
|
|
194
278
|
print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
|
195
279
|
continue
|
196
280
|
|
281
|
+
# Now add hyperlinks to the summary sheet (after all sheets are created)
|
282
|
+
if summary_data and sheet_mapping:
|
283
|
+
summary_ws = writer.sheets["Table Summary"]
|
284
|
+
_style_summary_sheet(summary_ws, summary_df, sheet_mapping)
|
285
|
+
|
197
286
|
return excel_path
|
doctra/exporters/html_writer.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
2
2
|
import os
|
3
3
|
import re
|
4
4
|
import base64
|
5
|
-
from typing import List, Dict, Any
|
5
|
+
from typing import List, Dict, Any, Optional
|
6
6
|
from markdown_it import MarkdownIt
|
7
7
|
|
8
8
|
|
@@ -64,6 +64,114 @@ def _process_image_paths(md_content: str, out_dir: str) -> str:
|
|
64
64
|
return processed_content
|
65
65
|
|
66
66
|
|
67
|
+
def write_html_from_lines(html_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
|
68
|
+
"""
|
69
|
+
Convert HTML lines directly into a single HTML file and save it.
|
70
|
+
|
71
|
+
This function is used when VLM is enabled to ensure proper HTML table formatting
|
72
|
+
instead of markdown-to-HTML conversion.
|
73
|
+
|
74
|
+
:param html_lines: List of HTML strings to join into a single file
|
75
|
+
:param out_dir: Directory where the HTML file will be saved
|
76
|
+
:param filename: Name of the HTML file (default: "result.html")
|
77
|
+
:return: The absolute path of the written HTML file
|
78
|
+
"""
|
79
|
+
os.makedirs(out_dir, exist_ok=True)
|
80
|
+
|
81
|
+
# Join HTML lines and clean up excessive blank lines
|
82
|
+
html_content = "\n".join(html_lines).strip() + "\n"
|
83
|
+
html_content = re.sub(r"\n{3,}", "\n\n", html_content)
|
84
|
+
|
85
|
+
# Process image paths to convert relative paths to absolute paths or base64
|
86
|
+
html_content = _process_image_paths(html_content, out_dir)
|
87
|
+
|
88
|
+
# Always apply table styling to ensure all tables are properly formatted
|
89
|
+
html_content = _add_table_styling(html_content)
|
90
|
+
|
91
|
+
# Create complete HTML document with modern styling
|
92
|
+
html_document = f"""<!DOCTYPE html>
|
93
|
+
<html lang="en">
|
94
|
+
<head>
|
95
|
+
<meta charset="UTF-8">
|
96
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
97
|
+
<title>Document Analysis Results</title>
|
98
|
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
|
99
|
+
<style>
|
100
|
+
{_get_css_styles()}
|
101
|
+
</style>
|
102
|
+
</head>
|
103
|
+
<body>
|
104
|
+
<button class="theme-toggle" onclick="toggleTheme()" title="Toggle dark mode"></button>
|
105
|
+
<div class="container">
|
106
|
+
<header class="header">
|
107
|
+
<div class="header-content">
|
108
|
+
<div class="header-text">
|
109
|
+
<h1>Document Analysis Results</h1>
|
110
|
+
<p class="subtitle">Intelligent Document Processing & Analysis</p>
|
111
|
+
</div>
|
112
|
+
<div class="header-badge">
|
113
|
+
Generated by Doctra
|
114
|
+
</div>
|
115
|
+
</div>
|
116
|
+
</header>
|
117
|
+
<main class="content">
|
118
|
+
{html_content}
|
119
|
+
</main>
|
120
|
+
<footer class="footer">
|
121
|
+
<div class="footer-content">
|
122
|
+
<div class="footer-brand">Doctra</div>
|
123
|
+
<div class="footer-info">
|
124
|
+
<span>Intelligent Document Processing</span>
|
125
|
+
<a href="https://github.com/AdemBoukhris457/Doctra" target="_blank">GitHub</a>
|
126
|
+
</div>
|
127
|
+
</div>
|
128
|
+
</footer>
|
129
|
+
</div>
|
130
|
+
<script>
|
131
|
+
// Theme toggle functionality
|
132
|
+
function toggleTheme() {{
|
133
|
+
const body = document.body;
|
134
|
+
const currentTheme = body.getAttribute('data-theme');
|
135
|
+
const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
|
136
|
+
|
137
|
+
body.setAttribute('data-theme', newTheme);
|
138
|
+
localStorage.setItem('doctra-theme', newTheme);
|
139
|
+
|
140
|
+
// Add smooth transition
|
141
|
+
body.style.transition = 'all 0.3s ease';
|
142
|
+
setTimeout(() => {{
|
143
|
+
body.style.transition = '';
|
144
|
+
}}, 300);
|
145
|
+
}}
|
146
|
+
|
147
|
+
// Load saved theme on page load
|
148
|
+
document.addEventListener('DOMContentLoaded', function() {{
|
149
|
+
const savedTheme = localStorage.getItem('doctra-theme') || 'light';
|
150
|
+
document.body.setAttribute('data-theme', savedTheme);
|
151
|
+
}});
|
152
|
+
|
153
|
+
// Add smooth scroll behavior
|
154
|
+
document.documentElement.style.scrollBehavior = 'smooth';
|
155
|
+
|
156
|
+
// Add loading animation
|
157
|
+
window.addEventListener('load', function() {{
|
158
|
+
document.body.style.opacity = '0';
|
159
|
+
document.body.style.transition = 'opacity 0.5s ease';
|
160
|
+
setTimeout(() => {{
|
161
|
+
document.body.style.opacity = '1';
|
162
|
+
}}, 100);
|
163
|
+
}});
|
164
|
+
</script>
|
165
|
+
</body>
|
166
|
+
</html>"""
|
167
|
+
|
168
|
+
html_path = os.path.join(out_dir, filename)
|
169
|
+
with open(html_path, "w", encoding="utf-8") as f:
|
170
|
+
f.write(html_document)
|
171
|
+
|
172
|
+
return os.path.abspath(html_path)
|
173
|
+
|
174
|
+
|
67
175
|
def write_html(md_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
|
68
176
|
"""
|
69
177
|
Convert collected Markdown lines into a single HTML file and save it.
|
@@ -414,6 +522,54 @@ def _create_html_table(headers: List[str], rows: List[List]) -> str:
|
|
414
522
|
"""
|
415
523
|
|
416
524
|
|
525
|
+
def render_html_table(
|
526
|
+
headers: List[str] | None,
|
527
|
+
rows: List[List[str]] | None,
|
528
|
+
title: Optional[str] = None,
|
529
|
+
) -> str:
|
530
|
+
"""
|
531
|
+
Render an HTML table from headers, rows, and optional title.
|
532
|
+
|
533
|
+
Creates a properly formatted HTML table with headers, data rows,
|
534
|
+
and optional title. This is used for VLM-extracted tables to ensure
|
535
|
+
they display as proper HTML tables instead of markdown.
|
536
|
+
|
537
|
+
:param headers: List of column headers (optional, will be auto-generated if None)
|
538
|
+
:param rows: List of data rows, where each row is a list of cell values
|
539
|
+
:param title: Optional title to display above the table
|
540
|
+
:return: Formatted HTML table string
|
541
|
+
"""
|
542
|
+
headers = headers or []
|
543
|
+
rows = rows or []
|
544
|
+
|
545
|
+
if not headers and not rows:
|
546
|
+
return "<p class='no-data'>No data available</p>"
|
547
|
+
|
548
|
+
# Determine width
|
549
|
+
width = len(headers) if headers else (max((len(r) for r in rows), default=1))
|
550
|
+
|
551
|
+
# Generate headers if not provided
|
552
|
+
if not headers:
|
553
|
+
headers = [f"Column {i+1}" for i in range(width)]
|
554
|
+
|
555
|
+
# Normalize data to handle mismatched dimensions
|
556
|
+
normalized_headers, normalized_rows = _normalize_data(headers, rows)
|
557
|
+
|
558
|
+
# Create HTML table
|
559
|
+
table_html = _create_html_table(normalized_headers, normalized_rows)
|
560
|
+
|
561
|
+
# Add title if provided
|
562
|
+
if title:
|
563
|
+
return f"""
|
564
|
+
<div class="table-section">
|
565
|
+
<h3 class="table-title">{_escape_html(title)}</h3>
|
566
|
+
{table_html}
|
567
|
+
</div>
|
568
|
+
"""
|
569
|
+
else:
|
570
|
+
return table_html
|
571
|
+
|
572
|
+
|
417
573
|
def _add_table_styling(html_content: str) -> str:
|
418
574
|
"""
|
419
575
|
Add table styling wrapper to HTML content.
|
@@ -884,6 +1040,55 @@ def _get_css_styles() -> str:
|
|
884
1040
|
content: '☀️';
|
885
1041
|
}
|
886
1042
|
|
1043
|
+
/* Dark mode table styles */
|
1044
|
+
[data-theme="dark"] .markdown-table,
|
1045
|
+
[data-theme="dark"] table {
|
1046
|
+
background: var(--card-bg);
|
1047
|
+
border-color: var(--border-color);
|
1048
|
+
}
|
1049
|
+
|
1050
|
+
[data-theme="dark"] .markdown-table th,
|
1051
|
+
[data-theme="dark"] table th {
|
1052
|
+
background: #374151;
|
1053
|
+
color: #f9fafb;
|
1054
|
+
border-bottom-color: var(--accent-color);
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
[data-theme="dark"] .markdown-table td,
|
1058
|
+
[data-theme="dark"] table td {
|
1059
|
+
color: #f9fafb;
|
1060
|
+
border-bottom-color: var(--border-color);
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
[data-theme="dark"] .markdown-table tr:nth-child(even),
|
1064
|
+
[data-theme="dark"] table tr:nth-child(even) {
|
1065
|
+
background: #374151;
|
1066
|
+
}
|
1067
|
+
|
1068
|
+
[data-theme="dark"] .markdown-table tr:hover,
|
1069
|
+
[data-theme="dark"] table tr:hover {
|
1070
|
+
background: #4b5563;
|
1071
|
+
}
|
1072
|
+
|
1073
|
+
/* Dark mode footer styles to match header */
|
1074
|
+
[data-theme="dark"] .footer {
|
1075
|
+
background: var(--primary-color);
|
1076
|
+
color: white;
|
1077
|
+
border-top-color: var(--accent-color);
|
1078
|
+
}
|
1079
|
+
|
1080
|
+
[data-theme="dark"] .footer-brand {
|
1081
|
+
color: white;
|
1082
|
+
}
|
1083
|
+
|
1084
|
+
[data-theme="dark"] .footer a {
|
1085
|
+
color: rgba(255, 255, 255, 0.8);
|
1086
|
+
}
|
1087
|
+
|
1088
|
+
[data-theme="dark"] .footer a:hover {
|
1089
|
+
color: white;
|
1090
|
+
}
|
1091
|
+
|
887
1092
|
/* Professional scrollbar */
|
888
1093
|
::-webkit-scrollbar {
|
889
1094
|
width: 8px;
|